iscream

Make fast and efficient BED file queries

James Eapen

June 25, 2025

iscream overview

  • All in R
  • Supported by C++, htslib, Armadillo
  • All operations a single line of code
  • Zero-copy modifications by reference where possible
  • Built-in parallelism, respects environment limits

Make tabix queries

# Collect the bgzipped BED files in the working directory. The first argument
# of list.files() is a directory, not a glob, so the file-name filter is given
# as a regular expression through `pattern`.
files <- list.files(pattern = "\\.bed\\.gz$")
# Read the query regions from a tab-separated BED file.
regions <- read.table(
  "regions.bed",
  sep = "\t",
  col.names = c("chr", "start", "end")
)

# Query all files over the regions; the names for the data columns are passed
# together as one character vector.
query <- tabix(files, regions, col.names = c("A", "B", "C"))
# The same query with the regions wrapped in GRanges().
query.with_colnames <- tabix(files, GRanges(regions))
            chr     start       end     A     B     C    file
         <char>     <int>     <int> <num> <int> <int>  <char>
      1:      1   4785488   4785488     0     0     2 cell_01
      2:      1   4785513   4785513     0     0     2 cell_01
      3:      1   4785522   4785522     0     0     2 cell_01
      4:      1   4785533   4785533     0     0     2 cell_01
      5:      1   4786780   4786780   100     1     0 cell_01
     ---                                                     
2201516:      X 168673020 168673020   100     1     0 cell_30
2201517:      X 168673032 168673032     0     0     1 cell_30
2201518:      X 168673164 168673164     0     0     1 cell_30
2201519:      X 168674367 168674367   100     1     0 cell_30
2201520:      X 168675047 168675047   100     1     0 cell_30

Make matrices from data columns

# Build a matrix from data column 4 of every file over the regions.
mat <- make_mat(files, regions, column = 4)

# Same call requesting a sparse matrix. Spell out TRUE: `T` is an ordinary
# variable that can be reassigned.
mat <- make_mat(files, regions, column = 4, sparse = TRUE)

# Sparse matrix with an explicit `prealloc` value.
mat <- make_mat(files, regions, column = 4, sparse = TRUE, prealloc = 50000)
> head(mat$value)
6 x 30 sparse Matrix of class "dgCMatrix"
  [[ suppressing 30 column names ‘cell_01’, ‘cell_02’, ‘cell_03’ ... ]]
                                                                
[1,] . . . . . . . . . . . . . . . . 2 . . . 1 . 1 . 1 . . 1 . 1
[2,] . . . . . . . . . . . 2 . . . . 1 . 1 . 1 . 1 . 1 . . 1 . 2
[3,] . . . . . . . . . . . . . . . . . . . . 1 . 2 . 1 . . . . .
[4,] . . . . . . . . . . . 1 . . . . . . . . 1 . 1 . 1 . . . . .
[5,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
[6,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Summarize data columns

sum, mean, median, variance, SD, min, max, range, count

# Summarize data column 4 of every file over each region; no `fun` is given,
# so the function's default choice of summary statistics applies.
summary <- summarize_regions(files, regions, column = 4)

# Only the mean, computed for data columns 4 through 6.
means <- summarize_regions(files, regions, column = 4:6, fun = "mean")

# Minimum and maximum with custom column names.
# NOTE(review): a single column is selected here, but two names are supplied
# and the printed result has both A.* and B.* columns -- confirm whether
# `column` should select two columns.
min_max <- summarize_regions(
  files,
  regions,
  column = 4,
  col_names = c("A", "B"),
  fun = c("min", "max")
)
> head(min_max)
            Feature    file A.min B.min A.max B.max
1 1:3669498-3673498 cell_01    NA    NA    NA    NA
2 1:4407241-4411241 cell_01    NA    NA    NA    NA
3 1:4494413-4498413 cell_01    NA    NA    NA    NA
4 1:4783739-4787739 cell_01     0     0     1     2
5 1:4805823-4809823 cell_01     0     0     1     2
6 1:4855814-4859814 cell_01     0     0     1     2